#imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score
import shap
%matplotlib inline
Dataset reference:
# Load the King County house-sales dataset and summarize its schema/dtypes.
data = pd.read_csv('src/kc_house_data.csv')
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 21613 entries, 0 to 21612 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 21613 non-null int64 1 date 21613 non-null object 2 price 21613 non-null float64 3 bedrooms 21613 non-null int64 4 bathrooms 21613 non-null float64 5 sqft_living 21613 non-null int64 6 sqft_lot 21613 non-null int64 7 floors 21613 non-null float64 8 waterfront 21613 non-null int64 9 view 21613 non-null int64 10 condition 21613 non-null int64 11 grade 21613 non-null int64 12 sqft_above 21613 non-null int64 13 sqft_basement 21613 non-null int64 14 yr_built 21613 non-null int64 15 yr_renovated 21613 non-null int64 16 zipcode 21613 non-null int64 17 lat 21613 non-null float64 18 long 21613 non-null float64 19 sqft_living15 21613 non-null int64 20 sqft_lot15 21613 non-null int64 dtypes: float64(5), int64(15), object(1) memory usage: 3.5+ MB
data.head()
| id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7129300520 | 20141013T000000 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | 0 | 0 | ... | 7 | 1180 | 0 | 1955 | 0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
| 1 | 6414100192 | 20141209T000000 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0 | 0 | ... | 7 | 2170 | 400 | 1951 | 1991 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
| 2 | 5631500400 | 20150225T000000 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0 | 0 | ... | 6 | 770 | 0 | 1933 | 0 | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
| 3 | 2487200875 | 20141209T000000 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0 | 0 | ... | 7 | 1050 | 910 | 1965 | 0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
| 4 | 1954400510 | 20150218T000000 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0 | 0 | ... | 8 | 1680 | 0 | 1987 | 0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
5 rows × 21 columns
# Drop 'date' and 'id': row identifiers / sale timestamps, not used as features.
# errors='ignore' makes this cell idempotent — re-running it after the columns
# are already gone no longer raises KeyError (as seen in the traceback below).
data = data.drop(['date', 'id'], axis=1, errors='ignore')
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) /var/folders/75/5mh9hp1s1xn2ypzg82bcvlnr0000gn/T/ipykernel_16086/3295968318.py in <module> 1 # drop date because it is irrelevant for our findings ----> 2 data = data.drop(['date', 'id'],axis=1) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs) 309 stacklevel=stacklevel, 310 ) --> 311 return func(*args, **kwargs) 312 313 return wrapper ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in drop(self, labels, axis, index, columns, level, inplace, errors) 4904 weight 1.0 0.8 4905 """ -> 4906 return super().drop( 4907 labels=labels, 4908 axis=axis, ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in drop(self, labels, axis, index, columns, level, inplace, errors) 4148 for axis, labels in axes.items(): 4149 if labels is not None: -> 4150 obj = obj._drop_axis(labels, axis, level=level, errors=errors) 4151 4152 if inplace: ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in _drop_axis(self, labels, axis, level, errors) 4183 new_axis = axis.drop(labels, level=level, errors=errors) 4184 else: -> 4185 new_axis = axis.drop(labels, errors=errors) 4186 result = self.reindex(**{axis_name: new_axis}) 4187 ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in drop(self, labels, errors) 6015 if mask.any(): 6016 if errors != "ignore": -> 6017 raise KeyError(f"{labels[mask]} not found in axis") 6018 indexer = indexer[~mask] 6019 return self.delete(indexer) KeyError: "['date' 'id'] not found in axis"
# Getting rid of price outliers
data = data.drop(data[data.price > 3887500.0].index) # rid of 12 incredibly high outliers
# Getting rid of bedroom outliers (the filter is on `bedrooms`, not bathrooms)
data = data.drop(data[data.bedrooms > 15].index)
# Since most of the yr_renovated values are 0 (never renovated),
# the column is transformed to a 0/1 flag (yr_renovated -> renovated)
data['yr_renovated'] = pd.Series(np.where(data.yr_renovated.values > 0, 1, 0),
data.index)
data.rename(columns = {'yr_renovated':'renovated'}, inplace = True)
# Check out for outliers by printing histograms of all features
# Visual outlier scan: for every feature, show a histogram plus its
# 10-bin value counts and the observed min/max.
for col in data.columns:
    series = data[col]
    print(col)
    sns.histplot(series)
    plt.show()
    print(series.value_counts(bins=10, sort=False))
    print('Min', min(series))
    print('Max', max(series))
    print('\n\n')
price
(71224.999, 452500.0] 10912 (452500.0, 830000.0] 8054 (830000.0, 1207500.0] 1675 (1207500.0, 1585000.0] 520 (1585000.0, 1962500.0] 234 (1962500.0, 2340000.0] 86 (2340000.0, 2717500.0] 58 (2717500.0, 3095000.0] 31 (3095000.0, 3472500.0] 20 (3472500.0, 3850000.0] 10 Name: price, dtype: int64 Min 75000.0 Max 3850000.0 bedrooms
(-0.012, 1.1] 212 (1.1, 2.2] 2760 (2.2, 3.3] 9824 (3.3, 4.4] 6880 (4.4, 5.5] 1594 (5.5, 6.6] 269 (6.6, 7.7] 38 (7.7, 8.8] 13 (8.8, 9.9] 6 (9.9, 11.0] 4 Name: bedrooms, dtype: int64 Min 0 Max 11 bathrooms
(-0.009000000000000001, 0.8] 86 (0.8, 1.6] 5307 (1.6, 2.4] 7024 (2.4, 3.2] 7317 (3.2, 4.0] 1611 (4.0, 4.8] 201 (4.8, 5.6] 40 (5.6, 6.4] 9 (6.4, 7.2] 3 (7.2, 8.0] 2 Name: bathrooms, dtype: int64 Min 0.0 Max 8.0 sqft_living
(276.749, 1615.0] 7570 (1615.0, 2940.0] 10713 (2940.0, 4265.0] 2780 (4265.0, 5590.0] 441 (5590.0, 6915.0] 75 (6915.0, 8240.0] 19 (8240.0, 9565.0] 1 (9565.0, 10890.0] 0 (10890.0, 12215.0] 0 (12215.0, 13540.0] 1 Name: sqft_living, dtype: int64 Min 290 Max 13540 sqft_lot
(-1130.84, 165603.9] 21290 (165603.9, 330687.8] 251 (330687.8, 495771.7] 37 (495771.7, 660855.6] 10 (660855.6, 825939.5] 1 (825939.5, 991023.4] 7 (991023.4, 1156107.3] 2 (1156107.3, 1321191.2] 1 (1321191.2, 1486275.1] 0 (1486275.1, 1651359.0] 1 Name: sqft_lot, dtype: int64 Min 520 Max 1651359 floors
(0.997, 1.25] 10678 (1.25, 1.5] 1910 (1.5, 1.75] 0 (1.75, 2.0] 8231 (2.0, 2.25] 0 (2.25, 2.5] 160 (2.5, 2.75] 0 (2.75, 3.0] 613 (3.0, 3.25] 0 (3.25, 3.5] 8 Name: floors, dtype: int64 Min 1.0 Max 3.5 waterfront
(-0.002, 0.1] 21442 (0.1, 0.2] 0 (0.2, 0.3] 0 (0.3, 0.4] 0 (0.4, 0.5] 0 (0.5, 0.6] 0 (0.6, 0.7] 0 (0.7, 0.8] 0 (0.8, 0.9] 0 (0.9, 1.0] 158 Name: waterfront, dtype: int64 Min 0 Max 1 view
(-0.005, 0.4] 19484 (0.4, 0.8] 0 (0.8, 1.2] 332 (1.2, 1.6] 0 (1.6, 2.0] 962 (2.0, 2.4] 0 (2.4, 2.8] 0 (2.8, 3.2] 509 (3.2, 3.6] 0 (3.6, 4.0] 313 Name: view, dtype: int64 Min 0 Max 4 condition
(0.995, 1.4] 30 (1.4, 1.8] 0 (1.8, 2.2] 172 (2.2, 2.6] 0 (2.6, 3.0] 14021 (3.0, 3.4] 0 (3.4, 3.8] 0 (3.8, 4.2] 5677 (4.2, 4.6] 0 (4.6, 5.0] 1700 Name: condition, dtype: int64 Min 1 Max 5 grade
(0.987, 2.2] 1 (2.2, 3.4] 3 (3.4, 4.6] 29 (4.6, 5.8] 242 (5.8, 7.0] 11018 (7.0, 8.2] 6068 (8.2, 9.4] 2615 (9.4, 10.6] 1134 (10.6, 11.8] 398 (11.8, 13.0] 92 Name: grade, dtype: int64 Min 1 Max 13 sqft_above
(280.879, 1202.0] 5620 (1202.0, 2114.0] 9988 (2114.0, 3026.0] 4065 (3026.0, 3938.0] 1483 (3938.0, 4850.0] 351 (4850.0, 5762.0] 66 (5762.0, 6674.0] 21 (6674.0, 7586.0] 2 (7586.0, 8498.0] 3 (8498.0, 9410.0] 1 Name: sqft_above, dtype: int64 Min 290 Max 9410 sqft_basement
(-4.131, 413.0] 15006 (413.0, 826.0] 3427 (826.0, 1239.0] 2256 (1239.0, 1652.0] 697 (1652.0, 2065.0] 159 (2065.0, 2478.0] 38 (2478.0, 2891.0] 14 (2891.0, 3304.0] 1 (3304.0, 3717.0] 1 (3717.0, 4130.0] 1 Name: sqft_basement, dtype: int64 Min 0 Max 4130 yr_built
(1899.884, 1911.5] 851 (1911.5, 1923.0] 952 (1923.0, 1934.5] 1079 (1934.5, 1946.0] 1360 (1946.0, 1957.5] 2586 (1957.5, 1969.0] 3218 (1969.0, 1980.5] 2525 (1980.5, 1992.0] 2782 (1992.0, 2003.5] 2658 (2003.5, 2015.0] 3589 Name: yr_built, dtype: int64 Min 1900 Max 2015 renovated
(-0.002, 0.1] 20689 (0.1, 0.2] 0 (0.2, 0.3] 0 (0.3, 0.4] 0 (0.4, 0.5] 0 (0.5, 0.6] 0 (0.6, 0.7] 0 (0.7, 0.8] 0 (0.8, 0.9] 0 (0.9, 1.0] 911 Name: renovated, dtype: int64 Min 0 Max 1 zipcode
(98000.80099999999, 98020.8] 2853 (98020.8, 98040.6] 4378 (98040.6, 98060.4] 3345 (98060.4, 98080.2] 1699 (98080.2, 98100.0] 351 (98100.0, 98119.8] 4257 (98119.8, 98139.6] 1811 (98139.6, 98159.4] 1133 (98159.4, 98179.2] 1040 (98179.2, 98199.0] 733 Name: zipcode, dtype: int64 Min 98001 Max 98199 lat
(47.154, 47.218] 181 (47.218, 47.28] 329 (47.28, 47.342] 1428 (47.342, 47.405] 1930 (47.405, 47.467] 1432 (47.467, 47.529] 2584 (47.529, 47.591] 3863 (47.591, 47.653] 2997 (47.653, 47.715] 4019 (47.715, 47.778] 2837 Name: lat, dtype: int64 Min 47.1559 Max 47.7776 long
(-122.521, -122.399] 419 (-122.399, -122.278] 8829 (-122.278, -122.158] 5634 (-122.158, -122.037] 3908 (-122.037, -121.917] 2091 (-121.917, -121.797] 483 (-121.797, -121.676] 217 (-121.676, -121.556] 2 (-121.556, -121.435] 2 (-121.435, -121.315] 15 Name: long, dtype: int64 Min -122.519 Max -121.315 sqft_living15
(393.18800000000005, 980.1] 312 (980.1, 1561.2] 6439 (1561.2, 2142.3] 7587 (2142.3, 2723.4] 4200 (2723.4, 3304.5] 2001 (3304.5, 3885.6] 728 (3885.6, 4466.7] 232 (4466.7, 5047.8] 81 (5047.8, 5628.9] 12 (5628.9, 6210.0] 8 Name: sqft_living15, dtype: int64 Min 399 Max 6210 sqft_lot15
(-219.55, 87705.9] 21203 (87705.9, 174760.8] 196 (174760.8, 261815.7] 166 (261815.7, 348870.6] 21 (348870.6, 435925.5] 10 (435925.5, 522980.4] 1 (522980.4, 610035.3] 1 (610035.3, 697090.2] 0 (697090.2, 784145.1] 0 (784145.1, 871200.0] 2 Name: sqft_lot15, dtype: int64 Min 651 Max 871200
data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 21600 entries, 0 to 21612 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 price 21600 non-null float64 1 bedrooms 21600 non-null int64 2 bathrooms 21600 non-null float64 3 sqft_living 21600 non-null int64 4 sqft_lot 21600 non-null int64 5 floors 21600 non-null float64 6 waterfront 21600 non-null int64 7 view 21600 non-null int64 8 condition 21600 non-null int64 9 grade 21600 non-null int64 10 sqft_above 21600 non-null int64 11 sqft_basement 21600 non-null int64 12 yr_built 21600 non-null int64 13 renovated 21600 non-null int64 14 zipcode 21600 non-null int64 15 lat 21600 non-null float64 16 long 21600 non-null float64 17 sqft_living15 21600 non-null int64 18 sqft_lot15 21600 non-null int64 dtypes: float64(5), int64(14) memory usage: 3.3 MB
Pearson correlation matrix
We use the Pearson correlation coefficient to examine the strength and direction of the linear relationship between two continuous variables.
The correlation coefficient can range in value from −1 to +1. The larger the absolute value of the coefficient, the stronger the relationship between the variables. For the Pearson correlation, an absolute value of 1 indicates a perfect linear relationship. A correlation close to 0 indicates no linear relationship between the variables.
The sign of the coefficient indicates the direction of the relationship. If both variables tend to increase or decrease together, the coefficient is positive, and the line that represents the correlation slopes upward. If one variable tends to increase as the other decreases, the coefficient is negative, and the line that represents the correlation slopes downward.
# Heatmap of pairwise Pearson correlations between all remaining features.
sns.set(style="whitegrid", font_scale=1)
corr_matrix = data.corr()
plt.figure(figsize=(13, 13))
plt.title('Pearson Correlation Matrix', fontsize=25)
sns.heatmap(corr_matrix,
            square=True, cmap="GnBu",
            vmax=0.7, linewidths=0.25, linecolor='w',
            annot=True, annot_kws={"size": 7},
            cbar_kws={"shrink": .7})
<AxesSubplot:title={'center':'Pearson Correlation Matrix'}>
Which features are more correlated to the price?
# Rank all features by their Pearson correlation with the target price.
price_corr = data.corr()['price'].sort_values(ascending=False)
print(price_corr)
price 1.000000 sqft_living 0.694332 grade 0.677461 sqft_above 0.598753 sqft_living15 0.597792 bathrooms 0.520003 view 0.397511 lat 0.320394 bedrooms 0.317871 sqft_basement 0.312569 floors 0.264089 waterfront 0.248897 renovated 0.123182 sqft_lot 0.091994 sqft_lot15 0.084420 yr_built 0.054103 condition 0.040820 long 0.024028 zipcode -0.051128 Name: price, dtype: float64
Discovered various outlier samples with incredibly large basement square footage, while more than half of the dataset has no basement at all. This feature should be cleaned.
plot = sns.boxplot(data['sqft_basement'])
Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
# Distribution of basement sizes in 20 equal-width bins — the bulk of homes
# fall in the first bin (no or tiny basement).
data['sqft_basement'].value_counts(bins=20, sort=False)
(-4.131, 206.5] 13691 (206.5, 413.0] 1315 (413.0, 619.5] 1677 (619.5, 826.0] 1750 (826.0, 1032.5] 1477 (1032.5, 1239.0] 779 (1239.0, 1445.5] 477 (1445.5, 1652.0] 220 (1652.0, 1858.5] 109 (1858.5, 2065.0] 50 (2065.0, 2271.5] 32 (2271.5, 2478.0] 6 (2478.0, 2684.5] 10 (2684.5, 2891.0] 4 (2891.0, 3097.5] 0 (3097.5, 3304.0] 1 (3304.0, 3510.5] 1 (3510.5, 3717.0] 0 (3717.0, 3923.5] 0 (3923.5, 4130.0] 1 Name: sqft_basement, dtype: int64
sns.histplot(data['sqft_basement'])
<AxesSubplot:xlabel='sqft_basement', ylabel='Count'>
# Overview of pairwise relationships between every pair of features.
sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x7fc7604c55b0>
As longitude decreases — that is, going west in King County — there appear to be more houses, and particularly houses with higher prices. Similarly, as latitude increases — that is, as a house lies farther north — there are more high-priced houses in the King County area of Washington State.
sns.scatterplot(data['long'], data['price'])
Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
<AxesSubplot:xlabel='long', ylabel='price'>
sns.scatterplot(data['lat'], data['price'])
Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
<AxesSubplot:xlabel='lat', ylabel='price'>
# Separate features from the target and hold out 30% of the rows for
# evaluation (fixed seed for reproducibility).
y = data['price']
X = data.drop(columns='price')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
# Sanity-check the split sizes.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)
(15120, 18) (6480, 18) (15120,) (6480,)
# CatBoost hyper-parameters. The Iter overfitting detector stops training
# after `od_wait` non-improving iterations, and use_best_model keeps the
# model at its best test-RMSE iteration.
params = {'iterations': 10000,
          'learning_rate': 0.01,
          'depth': 3,
          'loss_function': 'RMSE',
          'eval_metric': 'RMSE',
          'random_seed': 55,
          'metric_period': 200,
          'od_type': 'Iter',
          'od_wait': 20,
          'verbose': True,
          'use_best_model': True}
model = CatBoostRegressor(**params)
# use_best_model and verbose are already set in `params`; passing them again
# to fit() was redundant. The stale commented-out 'cat_features' entry
# (left over from a Boston-housing experiment) is removed.
model.fit(X_train, y_train,
          eval_set=(X_test, y_test),
          plot=True)
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 345891.9480508 test: 345907.7598494 best: 345907.7598494 (0) total: 65.9ms remaining: 10m 59s 200: learn: 189988.8717602 test: 190680.6683071 best: 190680.6683071 (200) total: 454ms remaining: 22.1s 400: learn: 154776.2387828 test: 155286.2193118 best: 155286.2193118 (400) total: 795ms remaining: 19s 600: learn: 140133.9061283 test: 141310.5918705 best: 141310.5918705 (600) total: 1.16s remaining: 18.1s 800: learn: 132210.0940179 test: 134144.6337262 best: 134144.6337262 (800) total: 1.5s remaining: 17.3s 1000: learn: 127181.4633115 test: 129958.1415207 best: 129958.1415207 (1000) total: 1.85s remaining: 16.6s 1200: learn: 123359.3094657 test: 127261.7334717 best: 127261.7334717 (1200) total: 2.19s remaining: 16.1s 1400: learn: 120007.6008026 test: 125203.4198809 best: 125203.4198809 (1400) total: 2.51s remaining: 15.4s 1600: learn: 117386.9432710 test: 123751.2293790 best: 123751.2293790 (1600) total: 2.83s remaining: 14.9s 1800: learn: 115380.6178454 test: 122672.3821423 best: 122667.8901137 (1799) total: 3.21s remaining: 14.6s 2000: learn: 113579.9670289 test: 121642.4625666 best: 121642.4625666 (2000) total: 3.56s remaining: 14.2s 2200: learn: 111892.9979306 test: 120710.9809830 best: 120710.9809830 (2200) total: 3.9s remaining: 13.8s 2400: learn: 110331.6577205 test: 119856.9197229 best: 119851.6829093 (2398) total: 4.24s remaining: 13.4s 2600: learn: 109038.3602758 test: 119128.3665563 best: 119128.3665563 (2600) total: 4.58s remaining: 13s 2800: learn: 107639.0531884 test: 118289.9762116 best: 118289.9762116 (2800) total: 4.95s remaining: 12.7s 3000: learn: 106401.8630716 test: 117658.9152103 best: 117658.9152103 (3000) total: 5.3s remaining: 12.4s 3200: learn: 105287.0977579 test: 117151.8333360 best: 117151.8333360 (3200) total: 5.64s remaining: 12s 3400: learn: 104185.4108282 test: 116649.4155660 best: 116649.4155660 (3400) total: 5.99s remaining: 11.6s 3600: learn: 103183.0309551 test: 116192.1404032 best: 116192.1404032 (3600) total: 6.33s 
remaining: 11.2s 3800: learn: 102230.0496211 test: 115776.5596362 best: 115776.5596362 (3800) total: 6.67s remaining: 10.9s 4000: learn: 101337.7945761 test: 115441.5667786 best: 115441.5667786 (4000) total: 7s remaining: 10.5s 4200: learn: 100510.7079492 test: 115090.2869713 best: 115089.8052618 (4199) total: 7.35s remaining: 10.1s 4400: learn: 99721.1818495 test: 114771.7882240 best: 114768.3722578 (4396) total: 7.69s remaining: 9.78s 4600: learn: 99013.3172802 test: 114476.7330777 best: 114476.5943900 (4598) total: 8.03s remaining: 9.42s 4800: learn: 98315.0535317 test: 114225.0593788 best: 114222.4329617 (4792) total: 8.37s remaining: 9.07s 5000: learn: 97633.1803013 test: 113977.8426713 best: 113977.4221000 (4999) total: 8.72s remaining: 8.71s 5200: learn: 97000.2964890 test: 113755.0302323 best: 113754.3961384 (5199) total: 9.05s remaining: 8.35s 5400: learn: 96405.5209028 test: 113565.1380152 best: 113565.1380152 (5400) total: 9.4s remaining: 8.01s 5600: learn: 95851.6716772 test: 113355.8417090 best: 113355.8417090 (5600) total: 9.75s remaining: 7.65s 5800: learn: 95294.3907150 test: 113116.3388004 best: 113115.2411821 (5798) total: 10.1s remaining: 7.3s 6000: learn: 94734.5256658 test: 112817.6002707 best: 112817.6002707 (6000) total: 10.4s remaining: 6.95s 6200: learn: 94238.1977740 test: 112568.6692028 best: 112568.6692028 (6200) total: 10.8s remaining: 6.59s 6400: learn: 93709.2265475 test: 112312.6645411 best: 112312.6645411 (6400) total: 11.1s remaining: 6.24s 6600: learn: 93252.8478167 test: 112132.3237084 best: 112131.0433792 (6593) total: 11.4s remaining: 5.89s 6800: learn: 92783.6693648 test: 111906.9067528 best: 111906.0147754 (6797) total: 11.8s remaining: 5.54s 7000: learn: 92322.1106012 test: 111781.4955484 best: 111775.1585714 (6980) total: 12.1s remaining: 5.19s Stopped by overfitting detector (20 iterations wait) bestTest = 111775.1586 bestIteration = 6980 Shrink model to first 6981 iterations.
<catboost.core.CatBoostRegressor at 0x7fcc872d8fa0>
$R^2$: compares the model's predictions to the mean of the targets, i.e. the expected value.
# Hold-out evaluation: MAE, MSE, RMSE and R^2 on the test set.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('MAE: ', mean_absolute_error(y_test, y_pred))
print('MSE: ', mse)
print('RMSE: ', np.sqrt(mse))
print('R^2 score', model.score(X_test, y_test))
MAE: 67378.75256223505 MSE: 12493686073.656778 RMSE: 111775.15857137837 R^2 score 0.8967156311766572
$\phi_i \rightarrow$ Shapley value for feature $i$ (e.g.: {Age})
$f \rightarrow$ Black Box Model
$x \rightarrow$ Input data point
$z' \rightarrow$ Subset (e.g.: {Age, BMI})
$x' \rightarrow$ Simplified data input
Using a mapping function we transform $x \rightarrow x'$
$z'\subseteq x' \rightarrow$ Iterate over all possible combinations of features
$f_x(z') \rightarrow$ Black Box Model output for our input with the feature(s) we are interested in (e.g.:{Age, BMI})
$f_x(z' \backslash i) \rightarrow$ Black Box Model output for our input without the feature(s) we are interested in (e.g.:{BMI})
$[f_x(z') - f_x(z' \backslash i)] \rightarrow$ Tells us how feature $i$ contributed to that subset. Also called the marginal value
$M \rightarrow$ Total number of features
$\frac{|z'|!(M - |z'| - 1)!}{M!} \rightarrow$ Weighting according to how many players are in that coalition
$$\phi_i(f,x) = \sum_{z'\subseteq x'} \frac{|z'|!(M - |z'| - 1)!}{M!} [f_x(z') - f_x(z' \backslash i)]$$
from itertools import combinations

# A sample with M features has 2^(M-1) coalitions that exclude the feature
# of interest (the empty coalition included).
def get_coalitions(sample, feature):
    """Return every coalition (subset) of sample's features excluding `feature`.

    sample : pd.Series indexed by feature name.
    feature : name of the feature whose Shapley value is being computed.
    Returns a list of lists, from [] up to all remaining features.
    """
    # All features except the one we are attributing.
    rest_features = [feat for feat in sample.index if feat != feature]
    coalition_list = []
    # Bug fix: the size range must come from this sample's features —
    # the original read `len(x.index)` where `x` was a leaked global,
    # which only worked when that global happened to have the same length.
    for size in range(len(rest_features) + 1):
        for coalition in combinations(rest_features, size):
            coalition_list.append(list(coalition))
    return coalition_list
def get_val(model, background, sample, coalition):
    """Worth of a coalition: the mean model prediction over the background
    set with the coalition's features pinned to this sample's values.

    All other features keep their background-distribution values.
    """
    # Pin each coalition feature to the explained sample's value.
    overrides = {}
    for name in coalition:
        overrides[name] = sample[name]
    patched = background.assign(**overrides)
    return model.predict(patched).mean()
import scipy
from math import factorial

# One term of the Shapley sum: val(S U {i}) - val(S), Shapley-kernel weighted.
def get_contributions(model, background, sample, feat, coalition):
    """Weighted marginal contribution of `feat` relative to one coalition.

    Computes w(|S|) * [val(S U {feat}) - val(S)], where the Shapley kernel
    weight is w(z) = z!(M - z - 1)!/M! with M the total number of features.
    """
    with_feat = get_val(model, background, sample, coalition + [feat])
    without_feat = get_val(model, background, sample, coalition)
    marginal = with_feat - without_feat
    z = len(coalition)          # features present in the coalition
    M = len(sample.index)       # total number of features
    weight = factorial(z) * factorial(M - z - 1) / factorial(M)
    return weight * marginal
# def coalition_contribution(model, X_train, x, col, coalition):
# marginal_gain = get_val(model, X_train, x, coalition + [col]) - get_val(model, X_train, x, coalition)
# num_coalitions = 1 / (scipy.special.comb(len(x) - 1, len(coalition)) * len(x))
# return num_coalitions * marginal_gain
# print('good', coalition_contribution(model, X_train, X.iloc[0:1], ['floors'], ['bathrooms']))
def calculateSHAP(model, background, X):
    """Exact Shapley values for every row of X, from first principles.

    model : fitted regressor exposing .predict
    background : DataFrame used as the background distribution
    X : DataFrame of samples to explain

    Returns (phi0, shap_values): phi0 is the base value (mean prediction
    over the background) and shap_values is a list, one entry per sample,
    of per-feature Shapley values (same order as X.columns).

    Cost is O(2^M) coalitions per feature, so this is only viable for a
    handful of features. Leftover debug prints (print(1)/print(2)) were
    removed; the per-sample progress print is kept.
    """
    shap_values = []
    for i in range(np.shape(X)[0]):
        sample = X.iloc[i]
        shap_vals_for_x = []
        for feat in X.columns:
            # Sum the weighted marginal contributions over all coalitions
            # that exclude `feat`.
            coalitions = get_coalitions(sample, feat)
            contributions = [
                get_contributions(model, background, sample, feat, coalition)
                for coalition in coalitions
            ]
            shap_vals_for_x.append(np.sum(contributions))
        shap_values.append(shap_vals_for_x)
        print('feature done ', i)
    phi0 = np.average(model.predict(background))
    return phi0, shap_values
# Reference implementation: explain the first sample with the shap library,
# to compare against the manual Shapley computation in this notebook.
explainer = shap.Explainer(model.predict, X_train)
shap_values = explainer(X.iloc[0:1])
# Restrict to four features so the exact (2^M-coalition) Shapley computation
# below stays tractable, then refit the model on the reduced feature set.
X_train = X_train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']]
X_test = X_test[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']]
model.fit(X_train, y_train,
eval_set=(X_test, y_test),
use_best_model=True,
plot=True,
verbose=True)
Custom logger is already specified. Specify more than one logger at same time is not thread safe.
Warning: Overfitting detector is active, thus evaluation metric is calculated on every iteration. 'metric_period' is ignored for evaluation metric.
0: learn: 346240.5472848 test: 346245.6459987 best: 346245.6459987 (0) total: 4.06ms remaining: 40.6s 200: learn: 244950.9176752 test: 246191.8338167 best: 246191.8338167 (200) total: 314ms remaining: 15.3s 400: learn: 236410.3781219 test: 238969.7724612 best: 238969.7724612 (400) total: 619ms remaining: 14.8s 600: learn: 233077.2662404 test: 236673.2360873 best: 236673.2360873 (600) total: 932ms remaining: 14.6s 800: learn: 230981.6648494 test: 235469.8422829 best: 235469.8422829 (800) total: 1.26s remaining: 14.4s 1000: learn: 229458.6218974 test: 234757.4685653 best: 234757.4685653 (1000) total: 1.56s remaining: 14.1s 1200: learn: 228309.3816388 test: 234302.1611602 best: 234295.8773599 (1193) total: 1.87s remaining: 13.7s 1400: learn: 227275.1605411 test: 233762.1118315 best: 233762.1118315 (1400) total: 2.17s remaining: 13.3s 1600: learn: 226459.5291490 test: 233429.8827411 best: 233429.8827411 (1600) total: 2.5s remaining: 13.1s Stopped by overfitting detector (20 iterations wait) bestTest = 233362.5671 bestIteration = 1654 Shrink model to first 1655 iterations.
<catboost.core.CatBoostRegressor at 0x7fcc872d8fa0>
def coalition_worth(model, X_train, x, coalition):
    """Worth of a coalition: mean prediction over X_train with the
    coalition's features pinned to x's values."""
    pinned = {feature: x[feature] for feature in coalition}
    return model.predict(X_train.assign(**pinned)).mean()

def coalitions(x, col):
    """Yield every subset of x's features that excludes `col`, from the
    empty coalition up to all remaining features."""
    others = [feature for feature in x.index if feature != col]
    for size in range(len(others) + 1):
        yield from (list(subset) for subset in combinations(others, size))

def coalition_contribution(model, X_train, x, col, coalition):
    """Shapley-weighted marginal gain of adding `col` to `coalition`.

    Weight 1 / (C(M-1, |S|) * M) is the Shapley kernel written with a
    binomial coefficient (equivalent to |S|!(M-|S|-1)!/M!).
    """
    with_col = coalition_worth(model, X_train, x, coalition + [col])
    without_col = coalition_worth(model, X_train, x, coalition)
    weight = 1 / (scipy.special.comb(len(x) - 1, len(coalition)) * len(x))
    return weight * (with_col - without_col)

def calculate_exact_shap_values(model, X_train, X_sample):
    """Exact Shapley values for each row of X_sample.

    Returns (phi0, shap_values_list): the base value (mean prediction over
    X_train) and one list of per-feature Shapley values per sample row.
    A single Series is accepted and treated as a one-row frame.
    """
    if isinstance(X_sample, pd.Series):
        X_sample = pd.DataFrame(X_sample).T
    shap_values_list = []
    for i in range(X_sample.shape[0]):
        row = X_sample.iloc[i]
        # For each feature, sum its weighted marginals over all coalitions.
        row_values = [
            np.sum([coalition_contribution(model, X_train, row, col, subset)
                    for subset in coalitions(row, col)])
            for col in X_train.columns
        ]
        shap_values_list.append(row_values)
        print('feature finished', i)
    phi0 = np.average(model.predict(X_train))
    return phi0, shap_values_list
calculate_exact_shap_values(model, X_train.iloc[0:10], X.iloc[0])
feature finished 0
(476313.84119890817, [[4475.584453349034, 9320.233133025104, -162224.169014538, 35598.95174850948]])
calculateSHAP(model, X_train.iloc[0:10], X.iloc[0:1])
1 2 1 2 1 2 1 2 feature done 0
(476313.84119890817, [[4475.584453349034, 9320.233133025104, -162224.169014538, 35598.95174850948]])
# Align X with the reduced feature set so predictions and SHAP values use
# matching columns. (The X_train subsetting line was duplicated verbatim;
# the redundant copy is removed — subsetting is idempotent anyway.)
X_train = X_train[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']]
X = X[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']]
X_train
| bedrooms | bathrooms | sqft_living | sqft_lot | |
|---|---|---|---|---|
| 6351 | 3 | 2.50 | 1970 | 6291 |
| 8238 | 3 | 1.50 | 2230 | 6000 |
| 19251 | 5 | 2.50 | 2990 | 7292 |
| 4307 | 3 | 1.00 | 1130 | 7200 |
| 6360 | 3 | 2.50 | 3060 | 7831 |
| ... | ... | ... | ... | ... |
| 13135 | 4 | 2.50 | 2450 | 5079 |
| 19661 | 3 | 2.50 | 1650 | 1793 |
| 9856 | 3 | 1.75 | 1460 | 12367 |
| 10810 | 3 | 1.00 | 1030 | 8395 |
| 2736 | 3 | 1.75 | 1310 | 18400 |
15120 rows × 4 columns
shap_values
.values =
array([[ 1176.72304471, -11652.07931896, -80612.04994778,
-7140.81036387, 6051.33455908, -1961.08864735,
-8085.30680057, -8853.88935721, -51810.33454926,
-3627.74886706, 5709.54991419, 4656.60319149,
-1102.35000946, -26616.71562021, -77728.02089429,
2596.11671592, -32187.66915759, -1688.80031901]])
.base_values =
array([513418.88552921])
.data =
array([[ 3.00000e+00, 1.00000e+00, 1.18000e+03, 5.65000e+03,
1.00000e+00, 0.00000e+00, 0.00000e+00, 3.00000e+00,
7.00000e+00, 1.18000e+03, 0.00000e+00, 1.95500e+03,
0.00000e+00, 9.81780e+04, 4.75112e+01, -1.22257e+02,
1.34000e+03, 5.65000e+03]])
model.predict(X.iloc[0:1])
array([220542.34910198])
y[0]
221900.0
np.shape(X)
(21613, 18)
# Tree-model SHAP explainer for the fitted CatBoost regressor.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X) # returns an array of shapley values
shap_values2 = explainer(X) # this returns an Explanation object containing
# shapley values, data values, base values. Convenient for plotting
print('shap shape', np.shape(shap_values))
shap shape (21600, 18)
# Pick one row to explain and show its raw feature values.
sample = 10
print('data sample', sample)
data.iloc[sample,:]
data sample 10
price 662500.0000 bedrooms 3.0000 bathrooms 2.5000 sqft_living 3560.0000 sqft_lot 9796.0000 floors 1.0000 waterfront 0.0000 view 0.0000 condition 3.0000 grade 8.0000 sqft_above 1860.0000 sqft_basement 1700.0000 yr_built 1965.0000 renovated 0.0000 zipcode 98007.0000 lat 47.6007 long -122.1450 sqft_living15 2210.0000 sqft_lot15 8925.0000 Name: 10, dtype: float64
# SHAP values for the chosen sample (one value per feature).
shap_values2[sample]
.values =
array([ 6445.6665504 , 954.19074073, 177898.46191639, 9353.44427428,
2995.15805205, -6073.05016675, -10906.61842913, -11017.73548305,
-17174.70422995, 5308.78035444, -90282.66135448, -29318.29415725,
-2681.83657411, 6035.10626459, 139414.82771066, -31511.97459231,
10523.58385478, -5447.69598461])
.base_values =
536623.1089338156
.data =
array([ 3.00000e+00, 2.50000e+00, 3.56000e+03, 9.79600e+03,
1.00000e+00, 0.00000e+00, 0.00000e+00, 3.00000e+00,
8.00000e+00, 1.86000e+03, 1.70000e+03, 1.96500e+03,
0.00000e+00, 9.80070e+04, 4.76007e+01, -1.22145e+02,
2.21000e+03, 8.92500e+03])
# Compare the ground-truth price with the model's prediction for this sample.
print('Actual value = ', y[sample])
sample_pred = model.predict(X.iloc[sample:sample+1,:])
print('Predicted value = ', sample_pred)
Actual value = 662500.0 Predicted value = [691137.75768051]
shap.initjs()
# Force plot for one sample. Bug fix: the displayed feature values must come
# from the same matrix the SHAP values were computed on (X) — X_test has a
# different row order after train_test_split, so X_test.iloc[sample] showed
# an unrelated house's features next to this sample's SHAP values.
shap.force_plot(explainer.expected_value, shap_values[sample, :],
                X.iloc[sample, :])
# Waterfall for one prediction (top contributions only).
shap.plots.waterfall(shap_values2[sample], max_display=9)
# Beeswarm summary of per-feature SHAP value distributions.
shap.summary_plot(shap_values, X, max_display=22)
# Bar chart of mean |SHAP| global feature importance.
shap.summary_plot(shap_values2, X,plot_type="bar", max_display=22)